using <- function(...) {
libs <- unlist(list(...))
need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
if(length(need) > 0){
install.packages(need, repos = "https://cloud.r-project.org")
need <- need[!unlist(lapply(need, require, character.only = T))]
if (length(need) > 0) {
if (!requireNamespace("BiocManager", quietly = T))
install.packages("BiocManager")
BiocManager::install(need)
}
}
lapply(libs, require, character.only = T)
}
using("tidyverse", "DESeq2", "apeglm", "gprofiler2", "fgsea", "msigdbr", "org.Hs.eg.db",
"limma", "patchwork", "ggrepel", "reactable", "shiny", "tippy", "ggdist", "pheatmap")# load data
load('lec6_1.rda')
# create DESeq object
dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~condition)
# what samples are we working with?
coldata| condition | |
|---|---|
| ctrl1 | ctrl |
| ctrl2 | ctrl |
| ctrl3 | ctrl |
| ctrl4 | ctrl |
| ctrl5 | ctrl |
| t1d1 | t1d |
| t1d2 | t1d |
| t1d3 | t1d |
How does normalization show up on the MA plot?
dds <- DESeq(dds)
par(mfrow = c(1, 2))
limma::plotMA(log2(counts(dds) + 1), array = 8, main = "raw")
abline(h = 0, col = "grey")
limma::plotMA(log2(counts(dds, normalized = T) + 1), array = 8, main = "normalized")
abline(h = 0, col = "grey")What is the RLE normalization doing?
ref <- rowMeans(log(cts))
colnames(cts) %>%
lapply(function(s) {
cnts <- cts[,s]
tibble(x = exp((log(cnts) - ref)[is.finite(ref) & cnts > 0]),
y = s)
}) %>%
bind_rows() %>%
ggplot(aes(x, y)) +
stat_histinterval(breaks = 100) +
scale_x_continuous(trans = 'log', breaks = c(.05,1,20)) +
annotation_logticks(sides = 'b') +
labs(x = 'Ratio', y = 'Sample')res <- results(dds)
head(res)## log2 fold change (MLE): condition t1d vs ctrl
## Wald test p-value: condition t1d vs ctrl
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972 0.186742 -1.650058 3.64464 -0.452735 0.650739
## ENSG00000227232 5.733825 0.497553 1.02417 0.485813 0.627100
## ENSG00000278267 1.101126 -2.080596 3.39879 -0.612158 0.540433
## ENSG00000243485 0.451844 1.561078 3.54424 0.440455 0.659607
## ENSG00000284332 0.000000 NA NA NA NA
## ENSG00000237613 0.000000 NA NA NA NA
## padj
## <numeric>
## ENSG00000223972 NA
## ENSG00000227232 0.855427
## ENSG00000278267 NA
## ENSG00000243485 NA
## ENSG00000284332 NA
## ENSG00000237613 NA
resLFC <- lfcShrink(dds, coef = 2, type = "apeglm", res = res)
# what does coef = 2 mean?
resultsNames(dds)## [1] "Intercept" "condition_t1d_vs_ctrl"
par(mfrow = c(1, 2))
DESeq2::plotMA(res, ylim = c(-5, 5), maiin = 'Original')
DESeq2::plotMA(resLFC, ylim = c(-5, 5), main = 'Shrunken')volc <- function(r, x, y, ttl) {
d <- as.data.frame(r) %>%
dplyr::rename(x = !!x, y = !!y) %>%
mutate(kind = case_when(abs(x) > 2 & y < .01 ~ 'DE',
abs(x) > 2 ~ 'big',
y < .01 ~ 'sig',
T ~ 'NS'),
y = -log10(y))
ct <- na.omit(d) %>%
dplyr::filter(kind == 'DE') %>%
mutate(up = x > 0) %>%
dplyr::count(up) %>%
mutate(x = ifelse(up, Inf, -Inf),
y = Inf,
h = as.numeric(up))
ggplot(d, aes(x, y, color = kind)) +
geom_vline(xintercept = c(-2, 2), linetype = 'dashed') +
geom_hline(yintercept = 2, linetype = 'dashed') +
geom_point(alpha = .5) +
geom_label(aes(x = x, y = y, label = n, hjust = h),
vjust = 1, data = ct, inherit.aes = F) +
scale_y_continuous() +
scale_color_manual(values = c('forestgreen', 'red2', 'grey30', 'royalblue')) +
labs(x = x, y = y, title = ttl)
}
volc(res, 'log2FoldChange','padj', 'Original') +
theme(legend.position = 'none') +
volc(resLFC, 'log2FoldChange', 'padj', 'Shrunken')Before delving further, let’s make sure the samples we’re comparing for DE make sense with a simple PCA
rld <- rlog(dds, blind = F)
pd <- plotPCA(rld, intgroup = "condition", ntop = 500, returnData = T)
data.frame(extra) %>%
rownames_to_column('name') %>%
merge(pd) %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = condition)) +
geom_text(aes(label = name)) +
labs(x = sprintf('PC1: %.1f%% variance', 100 * attr(pd, 'percentVar')[1]),
y = sprintf('PC2: %.1f%% variance', 100 * attr(pd, 'percentVar')[2])) +
coord_fixed()assay(rld) %>%
apply(1, sd) %>%
order(decreasing = TRUE) %>%
.[1:100] %>%
assay(rld)[.,] %>%
heatmap()`
extra`
How can we go about addressing this?
colData(dds) <- cbind(coldata, extra)
colData(dds)$condition <- factor(colData(dds)$condition)
design(dds) <- ~ sex + condition
dds <- DESeq(dds)
res.s <- results(dds)
design(dds) <- ~ sex + age + condition
dds <- DESeq(dds)
res.sa <- results(dds)`
`
# clear workspace
rm(list = setdiff(ls(), 'volc'))
# load data
load('lec6_2.rda')
# create deseq dataset object
dds <- DESeqDataSetFromMatrix(countData = round(cts),
colData = coldata,
design = ~ condition)
# run deseq2
dds <- DESeq(dds)
# output all results
res <- results(dds)`
`
What about the top 10?
t10 <- data.frame(res) %>%
rownames_to_column("gene") %>%
slice_min(padj, n = 10, with_ties = F) %>%
arrange(padj) %>%
pull(gene)
counts(dds, normalized = TRUE) %>%
data.frame() %>%
rownames_to_column('gene') %>%
dplyr::filter(gene %in% t10) %>%
pivot_longer(-gene, names_to = 'samp', values_to = 'ct') %>%
merge(rownames_to_column(data.frame(coldata), 'samp')) %>%
mutate(gene = factor(gene, t10)) %>%
ggplot(aes(x = condition, y = ct, color = condition)) +
geom_point() +
scale_y_log10() +
facet_wrap(.~gene)`
`
rd <- data.frame(res) %>%
na.omit() %>%
rownames_to_column('gene') %>%
mutate(gene = sub('\\..*', '', gene))
g <- gost(rd$gene[rd$padj < .05 & abs(rd$log2FoldChange) > .5],
organism = 'hsapiens',
custom_bg = rd$gene)
g$result %>%
dplyr::select(source, term_name, term_size, query_size,
intersection_size, p_value) %>%
arrange(source, p_value) %>%
mutate(p_value = formatC(p_value, digits = 3, format = 'g')) %>%
reactable()gostplot(g)`
`
We can additionally apply GSEA
# mapping Ensembl IDs to gene symbols
e2s <- AnnotationDbi::select(org.Hs.eg.db,
key = rd$gene,
columns = "SYMBOL",
keytype = "ENSEMBL") %>%
na.omit() %>%
deframe()
# use DE test statistic to rank genes
s <- rd %>%
mutate(symb = e2s[gene]) %>%
na.omit() %>%
group_by(symb) %>%
summarise(stat = mean(stat)) %>%
deframe()
# pathways
p <- msigdbr(species = "Homo sapiens", category = "H") %>%
mutate(gs_name = sub('^HALLMARK_', '', gs_name)) %>%
{split(.$gene_symbol, .$gs_name)}
# GSEA
r <- fgsea(pathways = p, stats = s, eps = 0.0)
r %>%
arrange(NES) %>%
mutate(pathway = fct_inorder(pathway)) %>%
ggplot(aes(x = pathway, y = NES)) +
geom_col(aes(fill = -log10(padj))) +
scale_fill_viridis_c() +
coord_flip() +
labs(x = "Pathway",
y = "Normalized enrichment score")We can peek at the top hit
plotEnrichment(p[[arrange(r, padj)$pathway[1]]], s) +
ggtitle(arrange(r, padj)$pathway[1])Or as a table
arrange(r, padj) %>%
dplyr::select(pathway, padj, ES, NES, size, leadingEdge) %>%
mutate_at(c('pathway', 'padj', 'ES', 'NES'), function(x) {
formatC(x, digits = 3, format = 'g')
}) %>%
reactable(
searchable = T,
highlight = T,
wrap = F,
resizable = T,
striped = T,
paginationType = "jump",
showPageSizeOptions = T,
defaultPageSize = 10,
columns = list(
leadingEdge = colDef(
html = T,
cell = function(value, index, name) {
value <- paste(value, collapse = ', ')
div(
style = "cursor: info;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;",
tippy(text = value, tooltip = value)
)
}
)
)
)